Loading packages:
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#install.packages("ggh4x")
# Install ggpointdensity only when it is not already available, so that
# re-running or knitting this script does not re-download the package every
# time. The package is fetched from CRAN over HTTPS rather than plain HTTP.
if (!requireNamespace("ggpointdensity", quietly = TRUE)) {
  install.packages("ggpointdensity", repos = "https://cloud.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/0_/80b5wwrn45g63fjxqr25xmpr0000gn/T//RtmpDxxCDb/downloaded_packages
#library(ggh4x)
library(ggpointdensity)
#library(ggplot2)
Read in the data
# Load the training data; the header row of the CSV supplies the column names.
df_all <- readr::read_csv(
  file = "final_project_train.csv",
  col_names = TRUE
)
## Rows: 677 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): region, customer, outcome
## dbl (35): rowid, xb_01, xb_02, xb_03, xn_01, xn_02, xn_03, xa_01, xa_02, xa_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compact overview of every column: name, type and the first few values.
glimpse(df_all)
## Rows: 677
## Columns: 38
## $ rowid <dbl> 1, 3, 4, 5, 8, 9, 11, 14, 15, 16, 17, 18, 19, 22, 24, 25, 27,…
## $ region <chr> "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "…
## $ customer <chr> "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "…
## $ xb_01 <dbl> 4.000000, 1.000000, 2.000000, 2.520000, 2.548387, 3.071429, 3…
## $ xb_02 <dbl> 4, 1, 2, 11, 6, 6, 10, 12, 9, 10, 8, 10, 10, 8, 6, 10, 13, 10…
## $ xb_03 <dbl> 4, 1, 2, -6, -1, 1, -4, -4, -2, -4, -2, -2, -2, -4, 1, -4, -3…
## $ xn_01 <dbl> 3.0000000, 2.0000000, 2.0000000, 1.5333333, 0.8387097, 1.8571…
## $ xn_02 <dbl> 3, 2, 4, 9, 3, 8, 6, 10, 10, 4, 6, 8, 9, 5, 7, 12, 12, 6, 6, …
## $ xn_03 <dbl> 3, 2, 0, -3, -4, -2, -5, -6, -3, -5, -3, -6, -4, -3, 0, -5, -…
## $ xa_01 <dbl> 12.000000, 3.000000, 9.000000, 7.080000, 6.451613, 6.857143, …
## $ xa_02 <dbl> 12, 3, 9, 29, 17, 18, 24, 27, 20, 19, 15, 24, 24, 15, 14, 26,…
## $ xa_03 <dbl> 12, 3, 9, -7, -2, 2, -9, -5, -3, -3, -1, 1, -2, -3, 3, -4, -5…
## $ xb_04 <dbl> 1.3333333, 1.0000000, 1.0000000, 0.8950476, 1.2247312, 1.1857…
## $ xb_05 <dbl> 1.3333333, 1.0000000, 1.0000000, -2.0000000, -0.5000000, 0.00…
## $ xb_06 <dbl> 1.333333, 1.000000, 1.000000, 4.000000, 4.000000, 3.000000, 6…
## $ xb_07 <dbl> 4.000000, 1.000000, 2.000000, 1.933333, 1.967742, 1.714286, 1…
## $ xb_08 <dbl> -1.00000000, 1.00000000, 0.00000000, -0.08000000, 0.35483871,…
## $ xn_04 <dbl> 1.0000000, 2.0000000, 1.0000000, 0.5268889, 0.4688172, 0.5607…
## $ xn_05 <dbl> 1.0000000, 2.0000000, 0.0000000, -1.0000000, -1.3333333, -1.0…
## $ xn_06 <dbl> 1.0, 2.0, 2.0, 2.5, 3.0, 2.0, 4.0, 4.0, 3.0, 2.0, 2.0, 2.5, 2…
## $ xn_07 <dbl> 3.000000, 2.000000, 2.500000, 1.493333, 1.225806, 1.642857, 1…
## $ xn_08 <dbl> -1.0000000, 2.0000000, -1.0000000, -0.4400000, -0.4516129, -0…
## $ xa_04 <dbl> 6.000000, 3.000000, 6.750000, 2.425333, 3.023656, 2.685714, 2…
## $ xa_05 <dbl> 6.0000000, 3.0000000, 4.5000000, -3.5000000, -0.6666667, 0.40…
## $ xa_06 <dbl> 6.000000, 3.000000, 9.000000, 9.000000, 13.000000, 6.000000, …
## $ xa_07 <dbl> 9.000000, 3.000000, 7.500000, 4.466667, 4.612903, 4.071429, 4…
## $ xa_08 <dbl> 3.0000000, 3.0000000, 6.0000000, 0.7066667, 1.3225806, 1.3571…
## $ xw_01 <dbl> 23.00000, 17.00000, 52.50000, 64.52564, 54.75758, 58.33333, 6…
## $ xw_02 <dbl> 23, 17, 48, 0, 12, 15, 0, 0, 0, 7, 14, 0, 0, 0, 8, 8, 0, 4, 2…
## $ xw_03 <dbl> 23, 17, 57, 106, 105, 101, 107, 109, 109, 104, 109, 99, 103, …
## $ xs_01 <dbl> 0.262073307, 0.330804757, 0.239795763, 0.142106837, 0.2442957…
## $ xs_02 <dbl> 0.26207331, 0.33080476, 0.19049123, -0.73321509, -0.12204299,…
## $ xs_03 <dbl> 0.2620733, 0.3308048, 0.2891003, 0.5500723, 1.3134719, 0.6540…
## $ xs_04 <dbl> 0.5375576, 0.4286607, 0.3676937, 0.2865445, 0.2375470, 0.2594…
## $ xs_05 <dbl> 0.5375575604, 0.4286607050, 0.2485001680, 0.0000000000, 0.043…
## $ xs_06 <dbl> 0.5375576, 0.4286607, 0.4868872, 0.6357541, 0.4327004, 0.8672…
## $ response <dbl> 2.617991, 1.184632, 2.216626, 2.726715, 1.483323, 2.039279, 1…
## $ outcome <chr> "non_event", "non_event", "event", "non_event", "non_event", …
High-level summary of the data. It is important to note that rowid, region, and customer are categorical inputs. All of the sentiment-derived features that begin with x are continuous inputs. Additionally, response is the continuous output, and outcome is the categorical output.
# Per-column summary statistics for the whole training set.
summary(df_all)
## rowid region customer xb_01
## Min. : 1.0 Length:677 Length:677 Min. :-4.000
## 1st Qu.: 312.0 Class :character Class :character 1st Qu.: 2.333
## Median : 647.0 Mode :character Mode :character Median : 3.250
## Mean : 648.2 Mean : 3.377
## 3rd Qu.: 972.0 3rd Qu.: 4.250
## Max. :1324.0 Max. :14.000
## xb_02 xb_03 xn_01 xn_02
## Min. :-4.000 Min. :-7.000 Min. :-4.0000 Min. :-4.000
## 1st Qu.: 3.000 1st Qu.:-1.000 1st Qu.: 0.7917 1st Qu.: 2.000
## Median : 6.000 Median : 1.000 Median : 1.6000 Median : 4.000
## Mean : 5.749 Mean : 1.217 Mean : 1.5581 Mean : 3.665
## 3rd Qu.: 8.000 3rd Qu.: 3.000 3rd Qu.: 2.4000 3rd Qu.: 6.000
## Max. :15.000 Max. :14.000 Max. :10.0000 Max. :13.000
## xn_03 xa_01 xa_02 xa_03
## Min. :-7.0000 Min. :-3.000 Min. :-3.00 Min. :-9.000
## 1st Qu.:-2.0000 1st Qu.: 6.000 1st Qu.: 8.00 1st Qu.: 0.000
## Median :-1.0000 Median : 8.000 Median :13.00 Median : 3.000
## Mean :-0.4018 Mean : 8.073 Mean :13.24 Mean : 3.836
## 3rd Qu.: 1.0000 3rd Qu.: 9.750 3rd Qu.:18.00 3rd Qu.: 7.000
## Max. :10.0000 Max. :35.000 Max. :38.00 Max. :35.000
## xb_04 xb_05 xb_06 xb_07
## Min. :-2.000 Min. :-3.0000 Min. :-2.000 Min. :-1.000
## 1st Qu.: 0.850 1st Qu.:-0.3333 1st Qu.: 1.200 1st Qu.: 1.667
## Median : 1.138 Median : 0.4000 Median : 2.000 Median : 2.000
## Mean : 1.153 Mean : 0.4079 Mean : 2.107 Mean : 2.097
## 3rd Qu.: 1.428 3rd Qu.: 1.0000 3rd Qu.: 3.000 3rd Qu.: 2.500
## Max. : 5.000 Max. : 5.0000 Max. : 9.000 Max. : 7.000
## xb_08 xn_04 xn_05 xn_06
## Min. :-4.0000 Min. :-4.0000 Min. :-4.0000 Min. :-4.000
## 1st Qu.:-0.2500 1st Qu.: 0.2678 1st Qu.:-1.0000 1st Qu.: 0.800
## Median : 0.2051 Median : 0.6000 Median :-0.2500 Median : 1.250
## Mean : 0.2124 Mean : 0.6038 Mean :-0.1584 Mean : 1.479
## 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.: 0.5000 3rd Qu.: 2.000
## Max. : 5.0000 Max. : 5.0000 Max. : 5.0000 Max. : 7.000
## xn_07 xn_08 xa_04 xa_05
## Min. :-4.000 Min. :-4.00000 Min. :-2.000 Min. :-8.000
## 1st Qu.: 1.000 1st Qu.:-1.00000 1st Qu.: 2.252 1st Qu.: 0.000
## Median : 1.400 Median :-0.30769 Median : 2.925 Median : 1.333
## Mean : 1.406 Mean :-0.26713 Mean : 2.945 Mean : 1.380
## 3rd Qu.: 1.833 3rd Qu.: 0.03704 3rd Qu.: 3.500 3rd Qu.: 2.667
## Max. : 5.000 Max. : 5.00000 Max. :12.000 Max. :12.000
## xa_06 xa_07 xa_08 xw_01
## Min. :-2.000 Min. :-2.000 Min. :-5.000 Min. : 9.00
## 1st Qu.: 3.000 1st Qu.: 3.882 1st Qu.: 0.400 1st Qu.: 44.36
## Median : 4.333 Median : 4.613 Median : 1.140 Median : 57.41
## Mean : 5.149 Mean : 4.699 Mean : 1.221 Mean : 57.02
## 3rd Qu.: 6.500 3rd Qu.: 5.400 3rd Qu.: 2.000 3rd Qu.: 67.50
## Max. :23.000 Max. :13.000 Max. :12.000 Max. :108.00
## xw_02 xw_03 xs_01 xs_02
## Min. : 0.00 Min. : 9.00 Min. :-0.3612 Min. :-0.89585
## 1st Qu.: 9.00 1st Qu.: 58.00 1st Qu.: 0.1449 1st Qu.:-0.14236
## Median : 24.00 Median : 93.00 Median : 0.2160 Median : 0.03546
## Mean : 31.87 Mean : 79.07 Mean : 0.2148 Mean : 0.02228
## 3rd Qu.: 49.00 3rd Qu.:101.00 3rd Qu.: 0.2839 3rd Qu.: 0.19274
## Max. :108.00 Max. :113.00 Max. : 0.7548 Max. : 0.69105
## xs_03 xs_04 xs_05 xs_06
## Min. :-0.3612 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.: 0.2412 1st Qu.:0.2438 1st Qu.:0.07934 1st Qu.:0.3040
## Median : 0.3870 Median :0.2908 Median :0.16213 Median :0.4324
## Mean : 0.4241 Mean :0.3011 Mean :0.18863 Mean :0.4666
## 3rd Qu.: 0.5940 3rd Qu.:0.3429 3rd Qu.:0.26336 3rd Qu.:0.5948
## Max. : 1.7907 Max. :0.8988 Max. :0.89883 Max. :1.3088
## response outcome
## Min. : 0.5725 Length:677
## 1st Qu.: 1.5615 Class :character
## Median : 2.2896 Mode :character
## Mean : 2.6756
## 3rd Qu.: 3.2764
## Max. :22.9219
Categorical Visualization of Counts
# Bar charts of row counts for each categorical column.
ggplot(df_all, aes(x = customer)) +
  geom_bar()

ggplot(df_all, aes(x = region)) +
  geom_bar()

ggplot(df_all, aes(x = outcome)) +
  geom_bar()
Continuous Visualizations:
The AFINN-derived features look roughly Gaussian.
# Long-format copy of the AFINN ("xa") columns: one row per rowid/feature pair.
df_all_pivot_xa <- df_all %>%
  select(starts_with("xa")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -rowid)

# Histogram of each xa feature in its own facet (free axes).
ggplot(df_all_pivot_xa, aes(x = value)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(vars(name), scales = "free")

# Violin plot of all xa features side by side.
ggplot(df_all_pivot_xa, aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Same reshape, but keeping region and customer as id columns.
df_graph <- df_all %>%
  select(starts_with("xa"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -c(rowid, region, customer))

# Density of each xa feature, one curve per region.
ggplot(df_graph, aes(x = value)) +
  geom_density(
    aes(group = region, color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(vars(name), labeller = "label_both", scales = "free")
The Bing-derived features look roughly Gaussian.
# Long-format copy of the Bing ("xb") columns: one row per rowid/feature pair.
df_all_pivot_xb <- df_all %>%
  select(starts_with("xb")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -rowid)

# Histogram of each xb feature in its own facet (free axes).
ggplot(df_all_pivot_xb, aes(x = value)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(vars(name), scales = "free")

# Violin plot of all xb features side by side.
ggplot(df_all_pivot_xb, aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Density of each xb feature, one curve per region.
df_all %>%
  select(starts_with("xb"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -c(rowid, region, customer)) %>%
  ggplot(aes(x = value)) +
  geom_density(
    aes(group = region, color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(vars(name), labeller = "label_both", scales = "free")
The NRC-derived features look roughly Gaussian.
# Long-format copy of the NRC ("xn") columns: one row per rowid/feature pair.
df_all_pivot_xn <- df_all %>%
  select(starts_with("xn")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -rowid)

# Histogram of each xn feature in its own facet (free axes).
ggplot(df_all_pivot_xn, aes(x = value)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(vars(name), scales = "free")

# Violin plot of all xn features side by side.
ggplot(df_all_pivot_xn, aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Density of each xn feature, one curve per region.
df_all %>%
  select(starts_with("xn"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -c(rowid, region, customer)) %>%
  ggplot(aes(x = value)) +
  geom_density(
    aes(group = region, color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(vars(name), labeller = "label_both", scales = "free")
The Word-derived feature xw_01 looks roughly Gaussian, but the other two features (xw_02 and xw_03) do not.
# Long-format copy of the Word ("xw") columns: one row per rowid/feature pair.
df_all_pivot_xw <- df_all %>%
  select(starts_with("xw")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -rowid)

# Histogram of each xw feature in its own facet (free axes).
ggplot(df_all_pivot_xw, aes(x = value)) +
  geom_histogram(binwidth = 3) +
  facet_wrap(vars(name), scales = "free")

# Violin plot of all xw features side by side.
ggplot(df_all_pivot_xw, aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Density of each xw feature, one curve per region.
df_all %>%
  select(starts_with("xw"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -c(rowid, region, customer)) %>%
  ggplot(aes(x = value)) +
  geom_density(
    aes(group = region, color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(vars(name), labeller = "label_both", scales = "free")
# Long-format copy of the sentimentr ("xs") columns: one row per
# rowid/feature pair.
df_all_pivot_xs <- df_all %>%
  select(starts_with("xs")) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -rowid)

# Histogram of each xs feature in its own facet (free axes).
ggplot(df_all_pivot_xs, aes(x = value)) +
  geom_histogram(binwidth = 0.03) +
  facet_wrap(vars(name), scales = "free")

# Violin plot of all xs features side by side.
ggplot(df_all_pivot_xs, aes(x = name, y = value)) +
  geom_violin(fill = "grey")

# Density of each xs feature, one curve per region.
df_all %>%
  select(starts_with("xs"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -c(rowid, region, customer)) %>%
  ggplot(aes(x = value)) +
  geom_density(
    aes(group = region, color = as.factor(region)),
    size = 1.2,
    adjust = 1.35
  ) +
  facet_wrap(vars(name), labeller = "label_both", scales = "free")
The ‘response’ variable doesn’t appear Gaussian, but if you log transform it the distribution is more recognizable.
# Histogram of the raw response, then of its natural log.
ggplot(df_all, aes(x = response)) +
  geom_histogram(bins = 25)

ggplot(df_all, aes(x = log(response))) +
  geom_histogram(binwidth = 0.1)
Conditioned Continuous Variables
Conditioned on Region and Customer
It looks like different regions are contributing to different AFINN features. For example, for xa_01, region ZZ contributes much less than the other two. However, it is the opposite for xa_06. At the same time, when looking at summary stats for xa_03, region ZZ has the middle 50% in the positive range compared to the rest of the regions. With regard to customer, the features seem similar except for xa_02. The summary stats for that feature vary when broken out by customer. Customer A has the largest middle 50% across the board.
# df_all %>% select(starts_with("xa"), region) %>% rowid_to_column() %>% pivot_longer(!c("rowid"))
# Long-format AFINN ("xa") features with region, customer and outcome kept
# as id columns.
df_graph <- df_all %>%
  select(starts_with("xa"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = -c(rowid, region, customer, outcome))

# Boxplots per feature, split by region.
ggplot(df_graph, aes(x = as.factor(name), y = value)) +
  geom_boxplot(aes(fill = region, color = region), alpha = 0.35) +
  facet_wrap(vars(name), labeller = "label_both", scales = "free") +
  scale_fill_viridis_d(name = "region") +
  scale_color_viridis_d(name = "region") +
  theme_bw()

# Boxplots per feature, split by customer.
ggplot(df_graph, aes(x = as.factor(name), y = value)) +
  geom_boxplot(aes(fill = customer, color = customer), alpha = 0.35) +
  facet_wrap(vars(name), labeller = "label_both", scales = "free") +
  scale_fill_viridis_d(name = "customer") +
  scale_color_viridis_d(name = "customer") +
  theme_bw()
Region YY seems like the stronger contributor for the Bing features when there is a standout. Otherwise, they are similar. Once again customer ‘A’ seems to have a large range for the middle 50%, and is the only customer for which the middle 50% is completely in the positive range for all of the features. For xb_02, the first half of the customers have a dramatically higher value than the latter half of customers. For many of the features, the customers all look similar. The regions don’t seem to matter either. Only region ZZ is slightly different from some of the other regions.
# Reshape the Bing ("xb") features once and reuse the result for both
# boxplots below, instead of running the same select/pivot pipeline twice.
df_graph_b <- df_all %>%
  select(starts_with("xb"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer"))

# Boxplots per feature, split by region.
df_graph_b %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    alpha = 0.35
  ) +
  facet_wrap(~ name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("region") +
  scale_color_viridis_d("region") +
  theme_bw()

# Boxplots per feature, split by customer.
df_graph_b %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    alpha = 0.35
  ) +
  facet_wrap(~ name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("customer") +
  scale_color_viridis_d("customer") +
  theme_bw()
For NRC lexicon, the densities all seem similar except for xn_01, xn_04, xn_08. Those features have less contribution from region ZZ. The NRC lexicon seems to follow the same patterns as the features above. The 2nd feature has the most variability between the regions and customers compared to the other features.
# Reshape the NRC ("xn") features once and reuse the result for both
# boxplots below, instead of running the same select/pivot pipeline twice.
df_graph_n <- df_all %>%
  select(starts_with("xn"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer"))

# Boxplots per feature, split by region.
df_graph_n %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    alpha = 0.35
  ) +
  facet_wrap(~ name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("region") +
  scale_color_viridis_d("region") +
  theme_bw()

# Boxplots per feature, split by customer.
df_graph_n %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    alpha = 0.35
  ) +
  facet_wrap(~ name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("customer") +
  scale_color_viridis_d("customer") +
  theme_bw()
For the Word lexicon, we have very distinctive distributions. xw_02 and xw_03 don’t appear Gaussian, and region ZZ isn’t a strong contributor. Region ZZ has the widest range for the middle 50%, and has a dramatically different median.
# Reshape the Word ("xw") features once and reuse the result for both
# boxplots below, instead of running the same select/pivot pipeline twice.
df_graph_w <- df_all %>%
  select(starts_with("xw"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer"))

# Boxplots per feature, split by region.
df_graph_w %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    alpha = 0.35
  ) +
  facet_wrap(~ name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("region") +
  scale_color_viridis_d("region") +
  theme_bw()

# Boxplots per feature, split by customer.
df_graph_w %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    alpha = 0.35
  ) +
  facet_wrap(~ name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("customer") +
  scale_color_viridis_d("customer") +
  theme_bw()
For sentimentr derived features, region ZZ is a strong contributor to the density compared to the other features. Once again region ZZ stands out for the 2nd feature, and the last feature. This is something to note later when fitting models.
# Reshape the sentimentr ("xs") features once and reuse the result for both
# boxplots below, instead of running the same select/pivot pipeline twice.
df_graph_s <- df_all %>%
  select(starts_with("xs"), region, customer) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer"))

# Boxplots per feature, split by region.
df_graph_s %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    alpha = 0.35
  ) +
  facet_wrap(~ name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("region") +
  scale_color_viridis_d("region") +
  theme_bw()

# Boxplots per feature, split by customer.
df_graph_s %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    alpha = 0.35
  ) +
  facet_wrap(~ name, labeller = "label_both", scales = "free") +
  scale_fill_viridis_d("customer") +
  scale_color_viridis_d("customer") +
  theme_bw()
Conditioned on Outcome
Feature 2 has the widest range that includes negative and positive values with the widest middle 50%. But as we know above, it may depend on different regions or customers. It also has the ‘largest’ outliers.
# Long-format AFINN ("xa") features with region, customer and outcome kept
# as id columns.
df_outcome <- df_all %>%
  select(starts_with("xa"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots per feature, split by outcome.
df_outcome %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons by outcome, one facet per feature.
# after_stat() replaces the superseded stat() helper for computed variables.
df_outcome %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_wrap(~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by region (rows).
df_outcome %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by customer (rows).
df_outcome %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")
For the Bing features, the 3rd one seems the most likely to have a negative or low value for the sentiment score. Features 2 and 3 have the largest middle 50% as well. However, it is important to note that the medians of all of the features are similar between event and non-event.
# Long-format Bing ("xb") features with region, customer and outcome kept
# as id columns.
df_outcome_b <- df_all %>%
  select(starts_with("xb"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots per feature, split by outcome.
df_outcome_b %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons by outcome, one facet per feature.
# after_stat() replaces the superseded stat() helper for computed variables.
df_outcome_b %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_wrap(~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by region (rows).
df_outcome_b %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by customer (rows).
df_outcome_b %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")
Once again the second feature has the widest middle 50%, with the largest outliers. The median for the non-event is well outside the middle 50% for the event on features 1, 2, 3. The NRC lexicon also has the non_event values much higher across the board compared to the event values.
# Long-format NRC ("xn") features with region, customer and outcome kept
# as id columns.
df_outcome_n <- df_all %>%
  select(starts_with("xn"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots per feature, split by outcome.
df_outcome_n %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons by outcome, one facet per feature.
# after_stat() replaces the superseded stat() helper for computed variables.
df_outcome_n %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_wrap(~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by region (rows).
df_outcome_n %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by customer (rows).
df_outcome_n %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")
It is interesting to see that for the Word count derived features, the summary statistics of event vs non_event are about the same.
# Long-format Word ("xw") features with region, customer and outcome kept
# as id columns.
df_outcome_w <- df_all %>%
  select(starts_with("xw"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots per feature, split by outcome.
df_outcome_w %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons by outcome, one facet per feature.
# after_stat() replaces the superseded stat() helper for computed variables.
df_outcome_w %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_wrap(~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by region (rows).
df_outcome_w %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by customer (rows).
df_outcome_w %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")
For the sentimentr-derived features, the 3rd feature, compared with the 2nd, has the largest middle 50% and some very extreme outliers on the positive side. However, the 2nd feature has larger outliers on the negative side. The medians for all of the features for event vs non_event are all about equal as well.
# Long-format sentimentr ("xs") features with region, customer and outcome
# kept as id columns.
df_outcome_s <- df_all %>%
  select(starts_with("xs"), region, customer, outcome) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "outcome"))

# Boxplots per feature, split by outcome.
df_outcome_s %>%
  ggplot(mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(outcome), color = as.factor(outcome)),
    alpha = 0.35
  ) +
  theme_bw()

# Density-scaled frequency polygons by outcome, one facet per feature.
# after_stat() replaces the superseded stat() helper for computed variables.
df_outcome_s %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_wrap(~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by region (rows).
df_outcome_s %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(region ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")

# Same polygons, additionally broken out by customer (rows).
df_outcome_s %>%
  ggplot(mapping = aes(x = value)) +
  geom_freqpoly(
    mapping = aes(color = outcome, y = after_stat(density)),
    size = 1.2, bins = 21
  ) +
  facet_grid(customer ~ name, scales = "free") +
  scale_color_brewer(palette = "Set1")
Correlation
# Correlation plots: first across every "x" input, then within each feature
# family (xa, xb, xn, xw, xs), drawn in that order.
purrr::walk(
  c("x", "xa", "xb", "xn", "xw", "xs"),
  function(prefix) {
    corr_mat <- cor(select(df_all, starts_with(prefix)))
    corrplot::corrplot(corr_mat, type = "upper", method = "square")
  }
)
Input to Output Relationships
# Long-format AFINN ("xa") features with region, customer and response kept
# as id columns.
df_dense_a <- df_all %>%
  select(starts_with("xa"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each feature. Reuses
# df_dense_a rather than rebuilding the same long table, and spells out
# `scales` in full instead of relying on partial argument matching.
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~ name, scales = "free")

# Scatter plus per-region linear trend. alpha is a fixed setting, so it is
# passed outside aes(); inside aes() it is treated as a mapped variable,
# which adds a spurious legend and does not draw the points at 0.1 opacity.
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
# Alternatives left disabled by the original author:
#geom_density(mapping = aes(group = interaction(region), color = as.factor(region)), size = 1.2, adjust = 1.35) +
#facet_grid( region ~ name, labeller = "label_both", scales = "free", space = "free")
## `geom_smooth()` using formula 'y ~ x'
# Per-region linear fits of response on each AFINN feature (no points drawn).
ggplot(df_dense_a, aes(x = value, y = response)) +
  geom_smooth(aes(color = region), method = "lm") +
  facet_wrap(vars(name), labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'
# Per-region default smoother of log(response) on each AFINN feature.
# Uses the natural log, matching every other log-response plot in this
# analysis (this plot alone used log10, which only rescales the y axis).
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Long-format Bing ("xb") features with region, customer and response kept
# as id columns.
df_dense_b <- df_all %>%
  select(starts_with("xb"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each feature. Reuses
# df_dense_b rather than rebuilding the same long table, and spells out
# `scales` in full instead of relying on partial argument matching.
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~ name, scales = "free")

# Scatter plus per-region linear trend. alpha is a fixed setting, so it is
# passed outside aes(); inside aes() it is treated as a mapped variable,
# which adds a spurious legend and does not draw the points at 0.1 opacity.
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~ name, labeller = "label_both", scales = "free")
# Alternatives left disabled by the original author:
#geom_density(mapping = aes(group = interaction(region), color = as.factor(region)), size = 1.2, adjust = 1.35) +
#facet_grid( region ~ name, labeller = "label_both", scales = "free", space = "free")
## `geom_smooth()` using formula 'y ~ x'
df_dense_b %>% ggplot(mapping = aes(x=value, y=response)) +
geom_smooth( mapping = aes(color = region)) +
facet_wrap( ~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
df_dense_b %>% ggplot(mapping = aes(x=value, y=response)) +
geom_smooth(method = lm, mapping = aes(color = region)) +
facet_wrap( ~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'
df_dense_b %>% ggplot(mapping = aes(x=value, y=log(response))) +
geom_smooth( mapping = aes(color = region)) +
facet_wrap( ~ name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Long-format view of the "xn" inputs: one row per (observation, xn feature),
# with region, customer and response carried alongside each value.
df_dense_n <- df_all %>%
  select(starts_with("xn"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each xn feature.
# Reuses df_dense_n instead of rebuilding the identical long table.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~name, scales = "free")

# Raw response against each xn feature with one linear fit per region.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# mapped through the alpha scale (adding a legend) rather than used as the
# literal opacity.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Raw response with the default smoother (loess, per the message below).
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Linear fits per region, without the underlying points.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# log(response) with the default smoother.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Long-format view of the "xw" inputs: one row per (observation, xw feature),
# with region, customer and response carried alongside each value.
df_dense_w <- df_all %>%
  select(starts_with("xw"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each xw feature.
# Reuses df_dense_w instead of rebuilding the identical long table.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~name, scales = "free")

# Raw response against each xw feature with one linear fit per region.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# mapped through the alpha scale (adding a legend) rather than used as the
# literal opacity.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Raw response with the default smoother (loess, per the message below).
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Linear fits per region, without the underlying points.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# log(response) with the default smoother.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Long-format view of the "xs" inputs: one row per (observation, xs feature),
# with region, customer and response carried alongside each value.
df_dense_s <- df_all %>%
  select(starts_with("xs"), region, customer, response) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response"))

# Point-density scatter of log(response) against each xs feature.
# Reuses df_dense_s instead of rebuilding the identical long table.
df_dense_s %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_pointdensity() +
  facet_wrap(~name, scales = "free")

# Raw response against each xs feature with one linear fit per region.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# mapped through the alpha scale (adding a legend) rather than used as the
# literal opacity.
df_dense_s %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Raw response with the default smoother (loess, per the message below).
df_dense_s %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Linear fits per region, without the underlying points.
df_dense_s %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# log(response) with the default smoother.
df_dense_s %>%
  ggplot(mapping = aes(x = value, y = log(response))) +
  geom_smooth(mapping = aes(color = region)) +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# xa features vs. response, with trend lines split by customer.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# mapped through the alpha scale (adding a legend) rather than used as the
# literal opacity.
# NOTE(review): points are colored by region while the smooths are colored by
# customer; both feed the single color scale titled "customer" -- confirm this
# mix is intended.
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fits per customer, without the underlying points.
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# log10(response) per customer with the default smoother (loess, per the
# message below).
df_dense_a %>%
  ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# xb features vs. response, with trend lines split by customer.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# mapped through the alpha scale (adding a legend) rather than used as the
# literal opacity.
# NOTE(review): points are colored by region while the smooths are colored by
# customer; both feed the single color scale titled "customer" -- confirm this
# mix is intended.
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fits per customer, without the underlying points.
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# log10(response) per customer with the default smoother (loess, per the
# message below).
df_dense_b %>%
  ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# xn features vs. response, with trend lines split by customer.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# mapped through the alpha scale (adding a legend) rather than used as the
# literal opacity.
# NOTE(review): points are colored by region while the smooths are colored by
# customer; both feed the single color scale titled "customer" -- confirm this
# mix is intended.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fits per customer, without the underlying points.
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# log10(response) per customer with the default smoother (loess, per the
# message below).
df_dense_n %>%
  ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# xw features vs. response, with trend lines split by customer.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# mapped through the alpha scale (adding a legend) rather than used as the
# literal opacity.
# NOTE(review): points are colored by region while the smooths are colored by
# customer; both feed the single color scale titled "customer" -- confirm this
# mix is intended.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fits per customer, without the underlying points.
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# log10(response) per customer with the default smoother (loess, per the
# message below).
df_dense_w %>%
  ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# xs features vs. response, with trend lines split by customer.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# mapped through the alpha scale (adding a legend) rather than used as the
# literal opacity.
# NOTE(review): points are colored by region while the smooths are colored by
# customer; both feed the single color scale titled "customer" -- confirm this
# mix is intended.
df_dense_s %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(color = region), alpha = 0.1) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# Linear fits per customer, without the underlying points.
df_dense_s %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_smooth(method = lm, mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using formula 'y ~ x'

# log10(response) per customer with the default smoother (loess, per the
# message below).
df_dense_s %>%
  ggplot(mapping = aes(x = value, y = log10(response))) +
  geom_smooth(mapping = aes(color = customer)) +
  scale_color_viridis_d(name = "customer", option = "inferno") +
  facet_wrap(~name, labeller = "label_both", scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Continuous Input and Binary Output
# Encode the binary outcome numerically: y is 1 where outcome is "event" and
# 0 otherwise.
df_y <- df_all %>% mutate(y = ifelse(outcome == "event", 1, 0))

# Long-format xa inputs with the 0/1 outcome; built once and shared by the two
# jitter plots below instead of repeating the same pipeline.
df_y_long_xa <- df_y %>%
  select(starts_with("xa"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response", "y"))

# Outcome vs. each xa feature, one facet row per region. Vertical jitter
# separates the overlapping 0/1 points. alpha is a fixed setting, so it is
# passed to the geom rather than mapped inside aes(), where it would go
# through the alpha scale and add a legend.
df_y_long_xa %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, with one facet row per customer.
df_y_long_xa %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")
# Long-format xb inputs with the 0/1 outcome; built once and shared by the two
# jitter plots below instead of repeating the same pipeline.
df_y_long_xb <- df_y %>%
  select(starts_with("xb"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response", "y"))

# Outcome vs. each xb feature, one facet row per region. alpha is a fixed
# setting, so it is passed to the geom rather than mapped inside aes(), where
# it would go through the alpha scale and add a legend.
df_y_long_xb %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, with one facet row per customer.
df_y_long_xb %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")
# Long-format xn inputs with the 0/1 outcome; built once and shared by the two
# jitter plots below instead of repeating the same pipeline.
df_y_long_xn <- df_y %>%
  select(starts_with("xn"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response", "y"))

# Outcome vs. each xn feature, one facet row per region. alpha is a fixed
# setting, so it is passed to the geom rather than mapped inside aes(), where
# it would go through the alpha scale and add a legend.
df_y_long_xn %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, with one facet row per customer.
df_y_long_xn %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")
# Long-format xw inputs with the 0/1 outcome; built once and shared by the two
# jitter plots below instead of repeating the same pipeline.
df_y_long_xw <- df_y %>%
  select(starts_with("xw"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response", "y"))

# Outcome vs. each xw feature, one facet row per region. alpha is a fixed
# setting, so it is passed to the geom rather than mapped inside aes(), where
# it would go through the alpha scale and add a legend.
df_y_long_xw %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, with one facet row per customer.
df_y_long_xw %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")
# Long-format xs inputs with the 0/1 outcome; built once and shared by the two
# jitter plots below instead of repeating the same pipeline.
df_y_long_xs <- df_y %>%
  select(starts_with("xs"), region, customer, response, y) %>%
  rowid_to_column() %>%
  pivot_longer(cols = !c("rowid", "region", "customer", "response", "y"))

# Outcome vs. each xs feature, one facet row per region. alpha is a fixed
# setting, so it is passed to the geom rather than mapped inside aes(), where
# it would go through the alpha scale and add a legend.
df_y_long_xs %>%
  ggplot(mapping = aes(x = value, y = y, color = region)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(region ~ name, scales = "free")

# Same view, with one facet row per customer.
df_y_long_xs %>%
  ggplot(mapping = aes(x = value, y = y, color = customer)) +
  geom_jitter(height = 0.04, alpha = 0.1) +
  facet_grid(customer ~ name, scales = "free")